Prediction Challenge - Germany

Author

Jonathan Kennel

Lagged linear regression model

This model can be interpreted and refined based on the responses of each regressor group. It will not be the most accurate predictor, but it is fast and can help identify the most important components. This method is similar to what is found in (Kennel 2020).

Set-up

start_time <- Sys.time()
# Load helper packages
library(data.table)
library(plotly)
library(tidymodels)
library(hydrorecipes)

# new names for predictors
nms_other <- c('datetime',
               'precipitation',
               'temperature_mean',
               'temperature_min',
               'temperature_max',
               'sea_pressure',
               'humidity',
               'wind',
               'insolation',
               'evapotranspiration')

Prepare data

outcome    <- fread('../../data/Germany/heads.csv')
predictors <- fread('../../data/Germany/input_data.csv')

# make names more verbose
setnames(outcome, c('datetime', 'wl'))
setnames(predictors, nms_other)

# join data and make a numeric time column
dat <- outcome[predictors, on = 'datetime']

# join data and make a numeric time column
dat <- outcome[predictors, on = 'datetime']

# ad hoc estimate of water deficit. Use distributed lag model for predictors
dat[, deficit := cumsum(scale(precipitation)) - cumsum(scale(evapotranspiration))]
dat[, deficit := lm(deficit~splines::ns(datetime, df = 6))$residuals]
varknots <- c(-40, 75)
nms <- paste0('deficit_', 1:(length(varknots) + 3))
dat[, c(nms) := as.data.table(splines::bs(deficit, knots = varknots))]

# ad hoc snow melt
dat[, min_temp_diff := c(0, diff(temperature_min, lag = 1))]
dat[, snow_melt := 0]
dat[min_temp_diff >= 7.1  & month(datetime) %in% c(1:3), snow_melt := 1]
dat[, snow_melt := snow_melt * precipitation]

# separate precipitation into snow and rain
dat[, precipitation_snow := 0]
dat[, precipitation_rain := 0]
dat[temperature_max <= 0, precipitation_snow := precipitation]
dat[temperature_max > 0, precipitation_rain := precipitation]

# scaled precipitation based on ET
dat[, precip_evapo := precipitation_rain * 1.0 / evapotranspiration]

# temperature changes
dat[, temperature_diff := temperature_max - temperature_mean]

# clean-up
dat[, deficit := NULL]
dat[, precipitation := NULL]

# create feature dataset
all <- recipe(wl~., dat) |>
  step_distributed_lag(precipitation_rain,    
                       knots = log_lags(45, 270)) |>
  step_distributed_lag(precipitation_snow,    
                       knots = log_lags(15, 90)) |>
  step_distributed_lag(precip_evapo,
                       knots = log_lags(30, 180)) |>
  step_distributed_lag(snow_melt,
                       knots = log_lags(30, 180)) |>
  step_distributed_lag(starts_with("deficit"),
                       knots = log_lags(51, 365 * 3.6)) |>
  step_ns(temperature_diff, deg_free = 12)|>
  step_rm(datetime) |>
  step_corr(all_predictors()) |>
  prep() |>
  bake(new_data = NULL)

setDT(all)

Fit model

fit <- lm(wl~., all)

Make predictions

dat <- cbind(dat, predict(fit, all, interval = "prediction"))

Plot results

A median filter to smooth the results as they appeared to have too much variance.

dat[, fit := runmed(fit, 5)]
p1 <- plot_ly(dat[datetime > as.POSIXct('2002-04-30', tz = 'UTC')],
              x = ~datetime, 
              y = ~wl, 
              type = "scatter", 
              mode = "lines", 
              name = "Water Level",
              line = list(color = "#808080")) |>
  add_lines(x = ~datetime, y = ~fit, name = "Predictions" ,
            line = list(color = "#6000FF60"))

p2 <- plot_ly(dat[year(datetime) > 2001], x = ~datetime, 
              y = ~wl - fit, 
              type = "scatter", 
              mode = "lines", 
              name = "Residuals",
              line = list(color = "#808080")) 
subplot(p1, p2, shareX = TRUE, nrows = 2)
sum((dat$wl-dat$fit)^2, na.rm = TRUE)
[1] 47.53229

Output submission

submission_times <- fread("submission_form_Germany.csv")
submission <- dat[datetime %in% submission_times$Date]

submission <- submission[, list(Date = datetime,
                                `Simulated Head` = fit,
                                `95% Lower Bound` = lwr,
                                `95% Upper Bound` = upr)]
fwrite(submission, "submission_form_Germany.csv")

end_time <- Sys.time()

Timings

Total elapsed time is 4.4 seconds.

References

Kennel, Jonathan. 2020. “High Frequency Water Level Responses to Natural Signals.” PhD thesis, 50 Stone Rd E, Guelph, ON N1G 2W1, Canada: University of Guelph. http://hdl.handle.net/10214/17890.

Computer and software specs

Macbook Air M1 2020

16 GB Ram

sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.1

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] hydrorecipes_0.0.5    yardstick_1.1.0       workflowsets_1.0.0   
 [4] workflows_1.1.2       tune_1.0.1            tidyr_1.2.1          
 [7] tibble_3.1.8          rsample_1.1.1         recipes_1.0.3        
[10] purrr_1.0.0           parsnip_1.0.3         modeldata_1.0.1      
[13] infer_1.0.4           dplyr_1.0.10          dials_1.1.0          
[16] scales_1.2.1          broom_1.0.2           tidymodels_1.0.0.9000
[19] plotly_4.10.1         ggplot2_3.4.0         data.table_1.14.6    

loaded via a namespace (and not attached):
 [1] lubridate_1.9.0     DiceDesign_1.9      httr_1.4.4         
 [4] tools_4.2.1         backports_1.4.1     utf8_1.2.2         
 [7] R6_2.5.1            rpart_4.1.19        DBI_1.1.3          
[10] lazyeval_0.2.2      colorspace_2.0-3    nnet_7.3-18        
[13] withr_2.5.0         tidyselect_1.2.0    compiler_4.2.1     
[16] cli_3.5.0           stringr_1.5.0       digest_0.6.31      
[19] rmarkdown_2.19      pkgconfig_2.0.3     htmltools_0.5.4    
[22] parallelly_1.33.0   lhs_1.1.6           fastmap_1.1.0      
[25] collapse_1.8.9      htmlwidgets_1.6.0   rlang_1.0.6        
[28] rstudioapi_0.14     generics_0.1.3      jsonlite_1.8.4     
[31] crosstalk_1.2.0     magrittr_2.0.3      Matrix_1.5-3       
[34] Rcpp_1.0.9          munsell_0.5.0       fansi_1.0.3        
[37] GPfit_1.0-8         lifecycle_1.0.3     furrr_0.3.1        
[40] stringi_1.7.8       yaml_2.3.6          MASS_7.3-58.1      
[43] grid_4.2.1          parallel_4.2.1      listenv_0.9.0      
[46] lattice_0.20-45     earthtide_0.0.14    splines_4.2.1      
[49] knitr_1.41          pillar_1.8.1        fftw_1.0-7         
[52] future.apply_1.10.0 codetools_0.2-18    glue_1.6.2         
[55] evaluate_0.19       RcppParallel_5.1.5  vctrs_0.5.1        
[58] foreach_1.5.2       gtable_0.3.1        future_1.30.0      
[61] assertthat_0.2.1    xfun_0.36           gower_1.0.1        
[64] prodlim_2019.11.13  class_7.3-20        survival_3.4-0     
[67] viridisLite_0.4.1   timeDate_4021.107   iterators_1.0.14   
[70] hardhat_1.2.0       lava_1.7.0          timechange_0.1.1   
[73] globals_0.16.2      ellipsis_0.3.2      ipred_0.9-13